# Load packagespkgs <-c("clusterProfiler", "enrichplot", "org.At.tair.db","ggpubr", "here", "tidyverse", "readxl")pacman::p_load(char = pkgs)# Source script with helper functionssource(here("mcic-scripts/rnaseq/rfuns/enrich_funs.R"))source(here("mcic-scripts/rnaseq/rfuns/DE_funs.R"))# Define input filesDE_file <-here("results/tracy/DE.tsv")counts_file <-here("results/DE/norm_mat.tsv") # Created by 'scripts/count_prep.R'meta_file <-here("metadata/meta.tsv") # Created by 'scripts/count_prep.R'annot_file <-here("data/ref/Chinese Spring Genome.xlsx")GO_ORA_file <-here("results/GO/GO_ORA_res.tsv")GO_GSEA_file <-here("results/GO/GO_GSEA_res.tsv")
Code
# Read and prep input files# Metadatameta <-read.delim(meta_file, header =TRUE, colClasses =c("character", rep("factor", 3)))# Gene annotationannot <-read_xlsx(annot_file) |> dplyr::select(gene = locusName,arabi_id =`Best-hit-arabi-name`,arabi_name =`Best-hit-arabi-defline`) |> dplyr::filter(!is.na(arabi_id)) |>distinct(gene, .keep_all =TRUE)annot_heatmap <- annot |> dplyr::select(arabi_id, arabi_name) |>distinct(arabi_id, .keep_all =TRUE) |>column_to_rownames("arabi_id")# DE resultsDE_res <-read_tsv(DE_file, show_col_types =FALSE) |>mutate(isDE =ifelse(padj <0.05, TRUE, FALSE)) |>left_join(annot, by ="gene") |> dplyr::rename(gene_wheat = gene) |>mutate(gene =ifelse(!is.na(arabi_id), arabi_id, gene_wheat))# Normalized count matrix# NOTE: Because Arabidopsis genes often correspond to multiple wheat genes,# in order to plot gene expression levels for genes in certain GO categories,# we'll have to compute means across such 'duplicated' genes# Doing this with the normalized matrix for now -- may not be the best waynorm_mat <-read.delim(counts_file) |>rownames_to_column("gene") |>left_join(annot |> dplyr::select(-arabi_name), by ="gene") |> dplyr::rename(gene_wheat = gene) |>mutate(gene =ifelse(!is.na(arabi_id), arabi_id, gene_wheat)) |>pivot_longer(cols =-c(gene_wheat, arabi_id, gene),names_to ="sample", values_to ="count") |>summarize(count =mean(count), .by =c(gene, sample)) |>pivot_wider(id_cols ="gene", names_from ="sample", values_from ="count") |>column_to_rownames("gene") |>as.matrix()# GO resultsGO_ORA_res <-read_tsv(GO_ORA_file, show_col_types =FALSE)GO_GSEA_res <-read_tsv(GO_GSEA_file, show_col_types =FALSE)# Get a vector with the contrastscontrasts <-unique(DE_res$contrast)
The plots below only show terms for which at least one contrast/DE-direction has p-value below 0.001, to make it more manageable. The numbers in the circles are the number of DEGs annotated with the GO term.
Only show Biological Processes terms for all contrasts and each DE direction:
# Specify a focal contrastfcontrast <- contrasts[2]# Run GSEA for one contrast, and keep gsea format for plotsset.seed(1)GO_GSEA_cp <-run_gsea(contrast = fcontrast, DE_res = DE_res,OrgDb ="org.At.tair.db", keyType ="TAIR",allow_dups =TRUE)
Contrast: XttxopAL1_vs_mock // Nr DE genes: 356 // Nr enriched: 82